package au.com.acpfg.misc.uniprot;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.knime.core.data.DataCell;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.RowIterator;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.knime.core.node.defaultnodesettings.SettingsModel;
import org.knime.core.node.defaultnodesettings.SettingsModelBoolean;
import org.knime.core.node.defaultnodesettings.SettingsModelColumnName;
import org.knime.core.node.defaultnodesettings.SettingsModelIntegerBounded;
import org.knime.core.node.defaultnodesettings.SettingsModelNumber;
import org.knime.core.node.defaultnodesettings.SettingsModelString;

/**
 * This is the model implementation of UniProtAccessor. Accesses the UniProt
 * data source (via webservices).
 *
 * <p>The node has one input port and one output port. For each input row an
 * accession is taken either from the row ID or from a configured column, the
 * accessions are handed in batches to a task object selected by the "task"
 * setting, and that task writes its results into the output container.
 *
 * @author Andrew Cassin
 */
public class UniProtAccessorNodeModel extends NodeModel {

    // the logger instance
    private static final NodeLogger logger = NodeLogger
            .getLogger(UniProtAccessorNodeModel.class);

    /*
     * The settings keys which are used to retrieve and store the settings (from
     * the dialog or from a settings file). Package visibility so that they are
     * usable from the dialog.
     */
    static final String CFGKEY_TASK            = "task";
    static final String CFGKEY_ACCSN_COL       = "accsn-col";
    static final String CFGKEY_UNIREF          = "uniref-db";
    static final String CFGKEY_FROM_ACCSN      = "from-accsn";
    static final String CFGKEY_TO_ACCSN        = "to-accsn";
    static final String CFGKEY_WANTXML         = "want-xml";
    static final String CFGKEY_CACHE           = "cache-results";
    static final String CFGKEY_CACHE_FRESHNESS = "cache-freshness";
    static final String CFGKEY_CACHE_FILENAME  = "cache-filename";

    /** initial defaults for configured parameters */
    private static final String DEFAULT_TASK      = "Retrieve UniProt Entries";
    private static final String DEFAULT_ACCSN_COL = "Accession";

    /** maximum number of accessions handed to the task object in a single call */
    private static final int BATCH_SIZE = 20;

    // One settings model per configuration key; all of them are created via
    // make() so that every caller of make() gets the same defaults.
    private final SettingsModelString     m_task            = make_as_string(CFGKEY_TASK);
    private final SettingsModelColumnName m_accsn_col       = (SettingsModelColumnName) make(CFGKEY_ACCSN_COL);
    private final SettingsModelString     m_uniref_db       = make_as_string(CFGKEY_UNIREF);
    private final SettingsModelString     m_from_accsn      = make_as_string(CFGKEY_FROM_ACCSN);
    private final SettingsModelString     m_to_accsn        = make_as_string(CFGKEY_TO_ACCSN);
    private final SettingsModelBoolean    m_want_xml        = (SettingsModelBoolean) make(CFGKEY_WANTXML);
    private final SettingsModelBoolean    m_cache           = (SettingsModelBoolean) make(CFGKEY_CACHE);
    private final SettingsModelString     m_cache_file      = make_as_string(CFGKEY_CACHE_FILENAME);
    private final SettingsModelNumber     m_cache_freshness = (SettingsModelNumber) make(CFGKEY_CACHE_FRESHNESS);

    /**
     * Constructor for the node model: one input port, one output port.
     */
    protected UniProtAccessorNodeModel() {
        super(1, 1);
    }

    /**
     * Creates a new settings model, initialised with its default value, for the
     * given configuration key.
     *
     * @param key one of the <code>CFGKEY_*</code> constants
     * @return a freshly constructed settings model, or <code>null</code> if the
     *         key is not one of the known constants
     */
    protected static SettingsModel make(String key) {
        if (key.equals(CFGKEY_TASK)) {
            return new SettingsModelString(CFGKEY_TASK, DEFAULT_TASK);
        }
        if (key.equals(CFGKEY_ACCSN_COL)) {
            return new SettingsModelColumnName(CFGKEY_ACCSN_COL, DEFAULT_ACCSN_COL);
        }
        if (key.equals(CFGKEY_UNIREF)) {
            // disabled by default
            SettingsModelString sms = new SettingsModelString(CFGKEY_UNIREF, "UniRef100");
            sms.setEnabled(false);
            return sms;
        }
        if (key.equals(CFGKEY_FROM_ACCSN)) {
            // disabled by default
            SettingsModelString sms = new SettingsModelString(CFGKEY_FROM_ACCSN, "UniProt");
            sms.setEnabled(false);
            return sms;
        }
        if (key.equals(CFGKEY_TO_ACCSN)) {
            // disabled by default
            SettingsModelString sms = new SettingsModelString(CFGKEY_TO_ACCSN, "TAIR");
            sms.setEnabled(false);
            return sms;
        }
        if (key.equals(CFGKEY_WANTXML)) {
            return new SettingsModelBoolean(CFGKEY_WANTXML, false);
        }
        if (key.equals(CFGKEY_CACHE)) {
            // NB: since true is the default here, the other cache parameters can
            // default to setEnabled(true) ie. default constructed
            return new SettingsModelBoolean(CFGKEY_CACHE, true);
        }
        if (key.equals(CFGKEY_CACHE_FRESHNESS)) {
            // default 180, permitted range 0..2000
            return new SettingsModelIntegerBounded(key, 180, 0, 2000);
        }
        if (key.equals(CFGKEY_CACHE_FILENAME)) {
            try {
                File f = File.createTempFile("uniprot-cache", ".db4o");
                return new SettingsModelString(CFGKEY_CACHE_FILENAME, f.getAbsolutePath());
            } catch (Exception e) {
                // Best effort: fall back to an empty filename, but leave a trace
                // of why no default cache file could be created.
                logger.warn("Unable to create a default cache file: " + e.getMessage());
                return new SettingsModelString(CFGKEY_CACHE_FILENAME, "");
            }
        }
        return null;
    }

    /**
     * Convenience wrapper around {@link #make(String)} for keys whose settings
     * model is a {@link SettingsModelString}.
     *
     * @param key one of the <code>CFGKEY_*</code> constants backed by a string model
     * @return the settings model cast to {@link SettingsModelString}
     */
    protected static SettingsModelString make_as_string(String key) {
        return (SettingsModelString) make(key);
    }

    /**
     * Reads an accession from every input row, collects the accessions into
     * batches of up to {@link #BATCH_SIZE} and passes each batch (together with
     * the rows it came from) to the configured task object, which adds the
     * results to the output container. Rows with a missing or empty accession
     * are not submitted to the task.
     *
     * @param inData the input tables; only <code>inData[0]</code> is used
     * @param exec   used for the output container, progress and cancellation
     * @return a single-element array holding the output table
     * @throws InvalidSettingsException if the accession column cannot be found
     *         or the configured task is not supported
     * @throws CanceledExecutionException if the user cancels execution
     * @throws Exception any failure raised by the task object
     */
    @Override
    protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
            final ExecutionContext exec) throws Exception {
        UniProtTaskInterface up = null;
        try {
            // where does the accession come from: the row ID or a column?
            final boolean use_rid = m_accsn_col.useRowID();
            int accsn_col_idx = -1;
            if (!use_rid) {
                accsn_col_idx = inData[0].getDataTableSpec().findColumnIndex(m_accsn_col.getStringValue());
                if (accsn_col_idx < 0) {
                    throw new InvalidSettingsException("Cannot find column: "
                            + m_accsn_col.getStringValue() + " - reset the node?");
                }
            }

            final DataTableSpec spec = inData[0].getDataTableSpec();
            final int n_rows = inData[0].getRowCount();
            int done_rows = 0;

            up = createTask(m_task.getStringValue());

            // output columns = input columns followed by the columns supplied by the task
            DataTableSpec outputSpec = new DataTableSpec("UniProt appended data", spec,
                    up.getTableSpec(m_want_xml.getBooleanValue()));
            BufferedDataContainer container = exec.createDataContainer(outputSpec);

            // batch data structures: parallel lists, entry i of each belongs together
            List<String>  batch_accsns = new ArrayList<String>(BATCH_SIZE);
            List<DataRow> batch_rows   = new ArrayList<DataRow>(BATCH_SIZE);

            // main loop
            RowIterator it = inData[0].iterator();
            while (it.hasNext()) {
                DataRow r = it.next();
                done_rows++; // skipped rows count towards progress as well

                String accsn = use_rid ? get_accsn(r, use_rid) : get_accsn(r, accsn_col_idx);
                if (accsn == null || accsn.length() < 1) {
                    continue;
                }
                String final_accsn = up.fix_accsn(accsn);
                if (final_accsn == null || final_accsn.length() < 1) {
                    continue;
                }
                batch_accsns.add(final_accsn);
                batch_rows.add(r);

                if (batch_accsns.size() >= BATCH_SIZE) {
                    exec.checkCanceled();
                    double progress = ((double) done_rows) / n_rows;
                    exec.setProgress(progress, "Fetching batch, from: " + batch_accsns.get(0)
                            + " (size " + batch_accsns.size() + ")");
                    batch_run(up, batch_accsns, batch_rows, container);
                    batch_accsns.clear();
                    batch_rows.clear();
                    // pause between batches (be nice to EBI facilities if they are
                    // being used ie. not cached); the task decides how long to wait
                    up.pause(exec, progress, "Pausing for 20sec. (to be nice to UniProt servers)");
                }
            }

            // NB: dont forget the last batch (probably not a multiple of BATCH_SIZE)!
            batch_run(up, batch_accsns, batch_rows, container);

            // finalise output for the node...
            container.close();
            BufferedDataTable out = container.getTable();
            return new BufferedDataTable[] { out };
        } catch (CanceledExecutionException e) {
            // a user cancel is not an error: propagate it without an error log entry
            throw e;
        } catch (Exception e) {
            // log through the node logger, keeping the stack trace
            logger.error(e.getMessage(), e);
            throw e;
        } finally {
            // and ensure that any cleanup code is given a chance to run... regardless of exceptions
            if (up != null) {
                up.cleanup();
            }
        }
    }

    /**
     * Creates the task object which corresponds to the configured task name.
     *
     * @param task the value of the "task" setting, may be <code>null</code>
     * @return the task object to run the queries with
     * @throws InvalidSettingsException if the task name is <code>null</code> or not recognised
     */
    private UniProtTaskInterface createTask(String task) throws InvalidSettingsException {
        if (task != null) {
            if (task.equals(DEFAULT_TASK)) {
                return new RetrieveEntryTask(this, "/uniprot/");
            }
            if (task.equals("Retrieve UniRef Entries")) {
                return new UniRefEntryTask(this, m_uniref_db.getStringValue());
            }
            if (task.startsWith("Retrieve UniPARC")) {
                return new UniPARCEntryTask(this, "/uniparc/");
            }
            if (task.startsWith("Map")) {
                return new AccessionMapTask(m_from_accsn.getStringValue(), m_to_accsn.getStringValue());
            }
        }
        throw new InvalidSettingsException("Unsupported task: " + task);
    }

    /**
     * Responsible for returning an accession for the specified row, taken from
     * the row ID. If you call this method, <code>use_rid</code> must be true or
     * an assertion will fail (when assertions are enabled). The boolean exists
     * only to distinguish this overload from the column-index one.
     *
     * @param r       the row to take the accession from
     * @param use_rid must be <code>true</code>
     * @return the trimmed row ID
     */
    protected String get_accsn(DataRow r, boolean use_rid) {
        assert (use_rid);
        return r.getKey().getString().trim();
    }

    /**
     * Returns the accession from the specified column.
     *
     * @param r             the row to take the accession from
     * @param accsn_col_idx index of the accession column, must not be negative
     * @return the trimmed string form of the cell, or <code>null</code> if the
     *         cell is <code>null</code> or missing
     */
    protected String get_accsn(DataRow r, int accsn_col_idx) {
        assert (accsn_col_idx >= 0);
        DataCell c = r.getCell(accsn_col_idx);
        if (c == null || c.isMissing()) {
            return null;
        }
        return c.toString().trim();
    }

    /**
     * Process the entire batch of rows (each with a corresponding accsn) and add the results into the specified
     * container for the UniProt task object to run. The batch can be of arbitrary size, but most tasks will only
     * process them one at a time. An empty batch is a no-op.
     *
     * @param up           the task object which performs the queries
     * @param batch_accsns the accessions to query
     * @param batch_rows   the input rows, in the same order as <code>batch_accsns</code>
     * @param container    receives the output rows
     * @return whatever the task's <code>run</code> method returns, or 0 for an empty batch
     * @throws Exception any failure raised by the task object
     */
    private int batch_run(UniProtTaskInterface up, List<String> batch_accsns,
            List<DataRow> batch_rows, BufferedDataContainer container) throws Exception {
        if (batch_accsns.isEmpty()) {
            return 0;
        }
        String[] accsns = batch_accsns.toArray(new String[0]);
        logger.debug("Running batch of accessions: " + accsns[0] + " - " + accsns[accsns.length - 1]);
        return up.run(accsns, batch_rows.toArray(new DataRow[0]), container);
    }

    /**
     * {@inheritDoc}
     *
     * Nothing to reset: the model keeps no state between executions.
     */
    @Override
    protected void reset() {
    }

    /**
     * {@inheritDoc}
     *
     * Always returns a <code>null</code> output spec, ie. the output columns
     * are not reported before execution.
     */
    @Override
    protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
            throws InvalidSettingsException {
        return new DataTableSpec[] { null };
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveSettingsTo(final NodeSettingsWO settings) {
        m_task.saveSettingsTo(settings);
        m_accsn_col.saveSettingsTo(settings);
        m_uniref_db.saveSettingsTo(settings);
        m_from_accsn.saveSettingsTo(settings);
        m_to_accsn.saveSettingsTo(settings);
        m_want_xml.saveSettingsTo(settings);
        m_cache.saveSettingsTo(settings);
        m_cache_file.saveSettingsTo(settings);
        m_cache_freshness.saveSettingsTo(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_task.loadSettingsFrom(settings);
        m_accsn_col.loadSettingsFrom(settings);
        m_uniref_db.loadSettingsFrom(settings);
        m_from_accsn.loadSettingsFrom(settings);
        m_to_accsn.loadSettingsFrom(settings);
        m_want_xml.loadSettingsFrom(settings);
        m_cache.loadSettingsFrom(settings);
        m_cache_file.loadSettingsFrom(settings);
        m_cache_freshness.loadSettingsFrom(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void validateSettings(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_task.validateSettings(settings);
        m_accsn_col.validateSettings(settings);
        m_uniref_db.validateSettings(settings);
        m_from_accsn.validateSettings(settings);
        m_to_accsn.validateSettings(settings);
        m_want_xml.validateSettings(settings);
        m_cache.validateSettings(settings);
        m_cache_file.validateSettings(settings);
        m_cache_freshness.validateSettings(settings);
    }

    /**
     * {@inheritDoc}
     *
     * No internals are stored by this node.
     */
    @Override
    protected void loadInternals(final File internDir, final ExecutionMonitor exec)
            throws IOException, CanceledExecutionException {
    }

    /**
     * {@inheritDoc}
     *
     * No internals are stored by this node.
     */
    @Override
    protected void saveInternals(final File internDir, final ExecutionMonitor exec)
            throws IOException, CanceledExecutionException {
    }

    /**
     * Has the user requested to cache files?
     *
     * @return the current value of the "cache-results" setting
     */
    public boolean isCaching() {
        return m_cache.getBooleanValue();
    }

    /**
     * Returns the File instance representing the cache file, or <code>null</code> if the user does not want
     * caching of results.
     *
     * @return a File built from the "cache-filename" setting, or <code>null</code> when caching is off
     */
    public File getCacheFile() {
        if (!isCaching()) {
            return null;
        }
        return new File(m_cache_file.getStringValue());
    }

    /**
     * Returns the number of days old which a cache entry is to be considered current, or <code>-1</code> if
     * caching is disabled.
     *
     * @return the "cache-freshness" setting, or <code>-1</code> when caching is off
     */
    public int getCacheFreshness() {
        if (!isCaching()) {
            return -1;
        }
        return ((SettingsModelIntegerBounded) m_cache_freshness).getIntValue();
    }
}